import sys
import numpy as np
import pandas as pd
from datatable import dt, f, by
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
from plotly import express as px, io as pio
pd.options.plotting.backend = 'plotly'
pio.renderers.default = 'plotly_mimetype+notebook_connected'
from utils import get_path, data_load
sys.path.insert(0, '../')
from secret import API_KEYOSM vs Google Maps
Fontes de Dados
Rotas SIH
Colunas
cols_sih = {
'cod_municipio': 'origem',
'hosp_cod_municipio': 'destino',
}Carregando tabela
path_sih = get_path('SIH', 'sih.jay')
df_sih = dt.fread(path_sih, columns=cols_sih)
df_sih.names = cols_sih
df_sih = df_sih[:, list(cols_sih.values())]
df_sih = df_sih[f.origem != f.destino, :]
df_sih['count'] = 1
df_sih = df_sih[:, dt.sum(f.count), by('origem', 'destino')]
df_sih = df_sih.sort(-f.count)
df_sih| origem | destino | count | |
|---|---|---|---|
| ▪▪▪▪ | ▪▪▪▪ | ▪▪▪▪▪▪▪▪ | |
| 0 | 510840 | 510340 | 22807 |
| 1 | 315460 | 310620 | 17147 |
| 2 | 260790 | 261160 | 16012 |
| 3 | 521250 | 530010 | 15461 |
| 4 | 280480 | 280030 | 15090 |
| 5 | 330350 | 330285 | 14875 |
| 6 | 230370 | 230440 | 14545 |
| 7 | 320500 | 320530 | 14543 |
| 8 | 150080 | 150140 | 14521 |
| 9 | 432300 | 431490 | 14273 |
| 10 | 520025 | 530010 | 13577 |
| 11 | 351500 | 355280 | 13106 |
| 12 | 520140 | 520870 | 12745 |
| 13 | 410040 | 410690 | 12717 |
| 14 | 522185 | 530010 | 12112 |
| ⋮ | ⋮ | ⋮ | ⋮ |
| 68,568 | 530010 | 521308 | 1 |
| 68,569 | 530010 | 521770 | 1 |
| 68,570 | 530010 | 522000 | 1 |
| 68,571 | 530010 | 522020 | 1 |
| 68,572 | 530010 | 522045 | 1 |
Rotas OSM
Carregando tabela
path_osm = get_path('DISTANCIAS', f'deslocamentos.csv.gzip')
df_osm = pd.read_csv(path_osm)
df_osm| origem | destino | distancia (km) | tempo (min) | count |
|---|---|---|---|---|
| Loading... (need help?) |
Rotas Gmaps
Colunas
cols_gmaps = [
'origem', 'destino', 'count',
'distance (value)', 'duration (value)',
'duration_in_traffic (value)',
]Carregando tabela
path_routes = get_path('GESTANTES', f'gmaps/routes.csv.gzip')
df_files = pd.read_csv(path_routes)
df_files| status | origem | destino | origin_addresses | destination_addresses | origem_latitude | origem_longitude | destino_latitude | destino_longitude | hour | departure_time | mode | traffic_model | distance (value) | distance (text) | duration (value) | duration (text) | duration_in_traffic (value) | duration_in_traffic (text) | count |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
Selecionando
driving pessimistic at 6pm
status = df_files['status'] == 'OK'
mode = df_files['mode'] == 'driving'
traffic_model = df_files['traffic_model'] == 'pessimistic'
hour = df_files['hour'] == 18
df_gmaps = df_files[status & mode & traffic_model & hour]
df_gmaps = df_gmaps.sort_values(by='count', ascending=False).reset_index(drop=True)
df_gmaps = df_gmaps[cols_gmaps]
df_gmaps| origem | destino | count | distance (value) | duration (value) | duration_in_traffic (value) |
|---|---|---|---|---|---|
| Loading... (need help?) |
Gerando tabela
df_sih = df_sih.to_pandas()
df_sih_osm = pd.merge(df_sih, df_osm, on=list(df_sih.columns))
df_sih_gmaps = pd.merge(df_sih, df_gmaps, on=list(df_sih.columns))
df_rotas = pd.merge(df_sih_osm, df_sih_gmaps, on=['origem', 'destino', 'count'])
df_rotas| origem | destino | count | distancia (km) | tempo (min) | distance (value) | duration (value) | duration_in_traffic (value) | |
|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
df_rotas['distance (value)'] = df_rotas['distance (value)'] / 1000
df_rotas['duration (value)'] = df_rotas['duration (value)'] / 60
df_rotas['duration_in_traffic (value)'] = df_rotas['duration_in_traffic (value)'] / 60
df_rotas| origem | destino | count | distancia (km) | tempo (min) | distance (value) | duration (value) | duration_in_traffic (value) | |
|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
dist_gmaps = df_rotas['distance (value)']
dist_osm = df_rotas['distancia (km)']
dist_diff = (dist_gmaps - dist_osm) / dist_osm
df_rotas['dist_diff'] = dist_diff
tempo_gmaps = df_rotas['duration (value)']
tempo_traffic_gmaps = df_rotas['duration_in_traffic (value)']
tempo_osm = df_rotas['tempo (min)']
tempo_diff = (tempo_gmaps - tempo_osm) / tempo_osm
df_rotas['tempo_diff'] = tempo_diff
tempo_diff = (tempo_gmaps - tempo_osm) / tempo_osm
tempo_traffic_diff = (tempo_traffic_gmaps - tempo_osm) / tempo_osm
df_rotas['tempo_diff'] = tempo_diff
df_rotas['tempo_traffic_diff'] = tempo_traffic_diff
df_rotas| origem | destino | count | distancia (km) | tempo (min) | distance (value) | duration (value) | duration_in_traffic (value) | dist_diff | tempo_diff | tempo_traffic_diff | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
Investigação
Completude
df_routes_nan = df_sih_osm.merge(df_sih_gmaps, on=['origem', 'destino', 'count'], how='left')
df_routes_valid = df_routes_nan[~df_routes_nan['distance (value)'].isna()]
df_routes_nan = df_routes_nan[df_routes_nan['distance (value)'].isna()]
df_routes_nan| origem | destino | count | distancia (km) | tempo (min) | distance (value) | duration (value) | duration_in_traffic (value) | |
|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
df_routes_nan[df_routes_nan['count'] > 1]| origem | destino | count | distancia (km) | tempo (min) | distance (value) | duration (value) | duration_in_traffic (value) | |
|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |
nan = df_routes_nan['count'].sum()
valid = df_routes_valid['count'].sum()Faltantes
px.bar(x=['nan', 'valid'], y=[nan, valid])px.histogram(df_routes_nan, x='count')#, range_x=[-.3, .45])Diferença entre OSM e Gmaps
Distância
Histograma
px.histogram(df_rotas, x='dist_diff', range_x=[-.3, .45])Scatter
# px.scatter(df_rotas, x='dist_diff', y=, range_x=[-.3, .45])Tempo (padrão)
Histograma
px.histogram(df_rotas, x='tempo_diff', range_x=[-.75, 1])Scatter
px.scatter(
df_rotas,
x='tempo_diff', y='distancia (km)',
opacity=.75, size='distance (value)',
log_x=True, log_y=True,
)Tempo (tráfego)
Histograma
px.histogram(df_rotas, x='tempo_traffic_diff', range_x=[-.6, 1.5])Scatter
px.scatter(
df_rotas,
x='tempo_traffic_diff', y='distancia (km)',
opacity=.75, size='distance (value)',
log_x=True, log_y=True,
)Exportando tabela
output_route = get_path('GESTANTES', 'routes.csv.gzip')
df_rotas.to_csv(output_route, index=False)pd.read_csv(output_route)| origem | destino | count | distancia (km) | tempo (min) | distance (value) | duration (value) | duration_in_traffic (value) | dist_diff | tempo_diff | tempo_traffic_diff |
|---|---|---|---|---|---|---|---|---|---|---|
| Loading... (need help?) |